{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tag2id, id2tag = {}, {}  # maps tag to id . tag2id: {\"VB\": 0, \"NNP\":1,..} , id2tag: {0: \"VB\", 1: \"NNP\"....}\n",
    "word2id, id2word = {}, {} # maps word to id\n",
    "\n",
    "for line in open('traindata.txt'):\n",
    "    items = line.split('/')\n",
    "    word, tag = items[0], items[1].rstrip()  # 抽取每一行里的单词和词性\n",
    "    \n",
    "    if word not in word2id:\n",
    "        word2id[word] = len(word2id)\n",
    "        id2word[len(id2word)] = word\n",
    "    if tag not in tag2id:\n",
    "        tag2id[tag] = len(tag2id)\n",
    "        id2tag[len(id2tag)] = tag\n",
    "\n",
    "M = len(word2id)  # M: 词典的大小、# of words in dictionary\n",
    "N = len(tag2id)   # N: 词性的种类个数  # of tags in tag set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 构建 pi, A, B\n",
    "# N: # of tags M: # of words in dictionary\n",
    "import numpy as np\n",
    "pi = np.zeros(N)   # 每个词性出现在句子中第一个位置的概率,  N: # of tags  pi[i]: tag i出现在句子中第一个位置的概率\n",
    "A = np.zeros((N, M)) # A[i][j]: 给定tag i, 出现单词j的概率。 N: # of tags M: # of words in dictionary\n",
    "B = np.zeros((N,N))  # B[i][j]: 之前的状态是i, 之后转换成转态j的概率 N: # of tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "prev_tag = \"\"\n",
    "for line in open('traindata.txt'):\n",
    "    items = line.split('/')\n",
    "    wordId, tagId = word2id[items[0]], tag2id[items[1].rstrip()]\n",
    "    if prev_tag == \"\":  # 这意味着是句子的开始\n",
    "        pi[tagId] += 1\n",
    "        A[tagId][wordId] += 1\n",
    "    else:  # 如果不是句子的开头\n",
    "        A[tagId][wordId] += 1\n",
    "        B[tag2id[prev_tag]][tagId] += 1\n",
    "    \n",
    "    if items[0] == \".\":\n",
    "        prev_tag = \"\"\n",
    "    else:\n",
    "        prev_tag = items[1].rstrip()\n",
    "\n",
    "# normalize\n",
    "pi = pi/sum(pi)\n",
    "for i in range(N):\n",
    "    A[i] /= sum(A[i])\n",
    "    B[i] /= sum(B[i])\n",
    "\n",
    "#  到此为止计算完了模型的所有的参数： pi, A, B : 模型的参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def log(v):\n",
    "    if v == 0:\n",
    "        return np.log(v+0.000001)\n",
    "    return np.log(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def viterbi(x, pi, A, B):\n",
    "    \"\"\"\n",
    "    x: user input string/sentence: x: \"I like playing soccer\"\n",
    "    pi: initial probability of tags\n",
    "    A: 给定tag, 每个单词出现的概率\n",
    "    B: tag之间的转移概率\n",
    "    \"\"\"\n",
    "    x = [word2id[word] for word in x.split(\" \")]  # x: [4521, 412, 542 ..]\n",
    "    T = len(x)\n",
    "    \n",
    "    dp = np.zeros((T,N))  # dp[i][j]: w1...wi, 假设wi的tag是第j个tag\n",
    "    ptr = np.array([[0 for x in range(N)] for y in range(T)] ) # T*N\n",
    "    # TODO: ptr = np.zeros((T,N), dtype=int)\n",
    "    \n",
    "    for j in range(N): # basecase for DP算法\n",
    "        dp[0][j] = log(pi[j]) + log(A[j][x[0]])\n",
    "    \n",
    "    for i in range(1,T): # 每个单词\n",
    "        for j in range(N):  # 每个词性\n",
    "            # TODO: 以下几行代码可以写成一行（vectorize的操作， 会使得效率变高）\n",
    "            dp[i][j] = -9999999\n",
    "            for k in range(N): # 从每一个k可以到达j\n",
    "                score = dp[i-1][k] + log(B[k][j]) + log(A[j][x[i]])\n",
    "                if score > dp[i][j]:\n",
    "                    dp[i][j] = score\n",
    "                    ptr[i][j] = k\n",
    "    \n",
    "    # decoding: 把最好的tag sequence 打印出来\n",
    "    best_seq = [0]*T  # best_seq = [1,5,2,23,4,...]  \n",
    "    # step1: 找出对应于最后一个单词的词性\n",
    "    best_seq[T-1] = np.argmax(dp[T-1])\n",
    "    \n",
    "    # step2: 通过从后到前的循环来依次求出每个单词的词性\n",
    "    for i in range(T-2, -1, -1): # T-2, T-1,... 1, 0\n",
    "        best_seq[i] = ptr[i+1][best_seq[i+1]]\n",
    "        \n",
    "    # 到目前为止, best_seq存放了对应于x的 词性序列\n",
    "    for i in range(len(best_seq)):\n",
    "        print (id2tag[best_seq[i]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NNP\n",
      "NNP\n",
      "NN\n",
      ",\n",
      "NN\n",
      "NN\n",
      "CC\n",
      "NNS\n",
      "IN\n",
      "DT\n",
      "NNS\n",
      "VBN\n",
      "IN\n",
      "DT\n",
      "NN\n"
     ]
    }
   ],
   "source": [
    "x = \"Social Security number , passport number and details about the services provided for the payment\"\n",
    "viterbi(x, pi, A, B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
